{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch09_sampling.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Modern statistics: Intuition, Math, Python, R\n",
    "## Mike X Cohen (sincxpress.com)\n",
    "### https://www.amazon.com/dp/B0CQRGWGLY\n",
    "#### Code for chapter 9 (sampling)\n",
    "\n",
    "---\n",
    "\n",
    "# About this code file:\n",
    "\n",
    "### This notebook will reproduce most of the figures in this chapter (some figures were made in Inkscape), and illustrate the statistical concepts explained in the text. The point of providing the code is not just for you to recreate the figures, but for you to modify, adapt, explore, and experiment with the code.\n",
    "\n",
    "### Solutions to all exercises are at the bottom of the notebook.\n",
    "\n",
    "#### This code was written in google-colab. The notebook may require some modifications if you use a different IDE."
   ],
   "metadata": {
    "id": "yeVh6hm2ezCO"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "aErGXtnTezCP"
   },
   "outputs": [],
   "source": [
    "# import libraries and define global settings\n",
    "import numpy as np\n",
    "import scipy.stats as stats\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# define global figure properties used for publication\n",
    "import matplotlib_inline.backend_inline\n",
    "matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format\n",
    "plt.rcParams.update({'font.size':14,             # font size\n",
    "                     'savefig.dpi':300,          # output resolution\n",
    "                     'axes.titlelocation':'left',# title location\n",
    "                     'axes.spines.right':False,  # remove axis bounding box\n",
    "                     'axes.spines.top':False,    # remove axis bounding box\n",
    "                     })"
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Cay5D0ZSetu4"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.1: Sampling variability in random data"
   ],
   "metadata": {
    "id": "FTWyQLKcetsQ"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "N = 500\n",
    "nSamples = 50\n",
    "kHistBins = 20\n",
    "\n",
    "# bins for histograms\n",
    "edges = np.linspace(-3,3,kHistBins+1)\n",
    "\n",
    "# declare matrices\n",
    "allHistY = np.zeros((nSamples,kHistBins))\n",
    "allMeans = np.zeros(nSamples)\n",
    "\n",
    "\n",
    "# setup figure\n",
    "f,axs = plt.subplots(2,1,figsize=(7,6))\n",
    "\n",
    "for sampi in range(nSamples):\n",
    "\n",
    "  # create data (parameters don't chage!)\n",
    "  data = np.random.normal(loc=0,scale=1,size=N)\n",
    "\n",
    "  # histograms\n",
    "  y,x = np.histogram(data,bins=edges)\n",
    "  allHistY[sampi,:] = y\n",
    "\n",
    "  # get means\n",
    "  allMeans[sampi] = np.mean(data)\n",
    "\n",
    "  # plot\n",
    "  c = np.random.uniform(low=.5,high=.9)\n",
    "  axs[0].plot((x[:-1]+x[1:])/2,y,'o',color=(c,c,c))\n",
    "\n",
    "  axs[1].plot(sampi,np.mean(data),'ks',markersize=12,markerfacecolor=(c,c,c))\n",
    "\n",
    "\n",
    "# plot the average histogram\n",
    "axs[0].plot((x[:-1]+x[1:])/2,np.mean(allHistY,axis=0),'k',linewidth=3)\n",
    "\n",
    "# plot the means, and the mean of the means\n",
    "axs[1].axhline(np.mean(allMeans),linestyle='--',color='k',zorder=-1)\n",
    "\n",
    "# make the plots look nicer\n",
    "axs[0].set(xlabel='Data value',ylabel='Count')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Histograms of each sample')\n",
    "axs[1].set(xlabel='Sample number',ylabel='Sample mean')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Scatter of sample means (mean=%.3g)' %np.mean(allMeans))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_exampleWithRandom.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "o8uj0oCmfLiD"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "6kPB0tCW21JC"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.2: Samples and variability of sample means"
   ],
   "metadata": {
    "id": "-avv2e6R6dFQ"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# number of samples\n",
    "nSamples = 50\n",
    "\n",
    "# histogram resolution\n",
    "k = 30\n",
    "edges = np.linspace(-3,14,31)\n",
    "xx = (edges[:-1]+edges[1:])/2\n",
    "\n",
    "# initialize output matrices\n",
    "meenz  = np.zeros(nSamples) # sample averages\n",
    "allYYs = np.zeros(k)        # average of histograms\n",
    "\n",
    "\n",
    "\n",
    "_,axs = plt.subplots(2,1,figsize=(4,6))\n",
    "\n",
    "# loop over samples\n",
    "for i in range(nSamples):\n",
    "\n",
    "  # generate random data from an exGaussian distribution\n",
    "  randomX = stats.exponnorm.rvs(np.random.uniform(low=.1,high=5),size=2000)\n",
    "\n",
    "  # get its histogram and normalize\n",
    "  yy,_ = np.histogram(randomX,bins=edges)\n",
    "  yy = yy/np.sum(yy)\n",
    "\n",
    "  # average the distributions\n",
    "  allYYs += yy\n",
    "\n",
    "  # store the average of the distribution\n",
    "  meenz[i] = np.mean(randomX)\n",
    "\n",
    "  # plot the line\n",
    "  rc = np.random.uniform(low=.4,high=.8) # random color\n",
    "  axs[0].plot(xx,yy,linewidth=.5,color=(rc,rc,rc))\n",
    "  axs[0].plot(meenz[i],yy[np.argmin(np.abs(xx-meenz[i]))],'k*',linewidth=.2,\n",
    "              markerfacecolor=(rc,rc,rc),markersize=12)\n",
    "\n",
    "\n",
    "# some plotting adjustments\n",
    "axs[0].set(xlim=xx[[0,-1]],ylabel='Probability',yticks=[])\n",
    "axs[0].set_title(r'$\\bf{B}$)  Data distributions')\n",
    "\n",
    "\n",
    "## the distribution of sample means\n",
    "axs[1].hist(meenz,20,facecolor=(.7,.7,.7),edgecolor='k')\n",
    "axs[1].plot(xx,allYYs/np.max(allYYs)*5,'k',zorder=-10)\n",
    "axs[1].set(xlim=xx[[0,-1]],xlabel='Data value',ylabel='Count',yticks=[])\n",
    "axs[1].set_title(r'$\\bf{C}$)  Means distribution')\n",
    "\n",
    "# output\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_distOfExGausMeans.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "RqUOnhkJ21Gg"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "e_NoEZ2oX5iF"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.3: Law of Large Numbers (demo 1)"
   ],
   "metadata": {
    "id": "CU57MgThK8N0"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# generate \"population\"\n",
    "population = [ 1, 2, 3, 4 ]\n",
    "for i in range(20):\n",
    "    population = np.hstack((population,population))\n",
    "\n",
    "nPop = len(population)\n",
    "expval = np.mean(population)\n",
    "print(f'Expected value (population mean): {expval}')\n",
    "print(f'Population size: {nPop}')"
   ],
   "metadata": {
    "id": "VH16IVrTxbY_"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "## experiment: draw larger and larger samples\n",
    "\n",
    "k = 1500  # maximum number of samples\n",
    "sampleAves = np.zeros(k)\n",
    "\n",
    "for i in range(k):\n",
    "  # get a sample\n",
    "  sample = np.random.choice(population,size=i+1)\n",
    "\n",
    "  # compute and store its mean\n",
    "  sampleAves[i] = np.mean( sample )\n",
    "\n",
    "\n",
    "# visualize!\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(sampleAves,'s',markerfacecolor=(.9,.9,.9),color=(.6,.6,.6))\n",
    "plt.plot([1,k],[expval,expval],'k',linewidth=4)\n",
    "plt.xlabel('Sample size')\n",
    "plt.ylabel('Value')\n",
    "plt.xlim([-20,k+20])\n",
    "plt.ylim([np.min(sampleAves)*.85,1.05*np.max(sampleAves)])\n",
    "plt.legend(('Sample average','Population average'))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_LLNdemo1.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "Yy9hDZ3QxbcC"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "9lzf7ATRlLoN"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.4: Law of Large Numbers (demo 2)"
   ],
   "metadata": {
    "id": "9DzuokM0lLln"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters and initializations\n",
    "samplesize   = 30\n",
    "numberOfExps = 50\n",
    "samplemeans  = np.zeros(numberOfExps)\n",
    "\n",
    "# run the experiment!\n",
    "for expi in range(numberOfExps):\n",
    "  # compute and store its mean\n",
    "  samplemeans[expi] = np.mean( np.random.choice(population,size=samplesize) )\n",
    "\n",
    "\n",
    "# show the results\n",
    "fig,ax = plt.subplots(2,1,figsize=(7,5))\n",
    "\n",
    "# each individual sample mean\n",
    "ax[0].plot(samplemeans,'s',markerfacecolor=(.9,.9,.9),color=(.6,.6,.6))\n",
    "ax[0].set_title(r'$\\bf{A}$)  Each sample mean')\n",
    "ax[0].set_xlabel('Sample number (s)')\n",
    "\n",
    "# cumulative average over the samples\n",
    "ax[1].plot(np.cumsum(samplemeans) / np.arange(1,numberOfExps+1),\n",
    "           's',markerfacecolor=(.9,.9,.9),color=(.6,.6,.6))\n",
    "ax[1].set_title(r'$\\bf{B}$)  Cumulative sample means')\n",
    "\n",
    "# multiline xtick labels\n",
    "xticks = np.arange(0,51,10)\n",
    "ax[1].set_xticks(xticks,labels=[f's={i}\\nN={i*samplesize}' for i in xticks])\n",
    "\n",
    "\n",
    "# common axis modifications\n",
    "for a in ax:\n",
    "  a.plot([0,numberOfExps],[np.mean(population),np.mean(population)],'k')\n",
    "  a.set(ylabel='Mean value',xlim=[-.5,numberOfExps+.5],\n",
    "        ylim=[np.min(samplemeans)*.85,1.1*np.max(samplemeans)])\n",
    "\n",
    "totSS = np.arange(1,numberOfExps+1)*samplesize\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_LLNdemo2.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "phH386r5xbe4"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "VUraD4awevGT"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.5: Visualization of sample mean variability"
   ],
   "metadata": {
    "id": "fVXUFw7jevDj"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "N = 512\n",
    "X = np.random.rand(N,2)\n",
    "\n",
    "\n",
    "sample1 = X[np.random.choice(N,size=40),:]\n",
    "sample2 = X[np.random.choice(N,size=40),:]\n",
    "\n",
    "plt.figure(figsize=(8,6))\n",
    "\n",
    "# plot all data points\n",
    "plt.plot(X[:,0],X[:,1],'s',color=(.7,.7,.7),markerfacecolor='w',markersize=10)\n",
    "\n",
    "# plot sample data\n",
    "plt.plot(sample1[:,0],sample1[:,1],'bo')\n",
    "plt.plot(sample2[:,0],sample2[:,1],'r^')\n",
    "\n",
    "# plot sample means\n",
    "plt.plot(np.mean(sample1[:,0]),np.mean(sample1[:,1]),'bo',markersize=15)\n",
    "plt.plot(np.mean(sample2[:,0]),np.mean(sample2[:,1]),'r^',markersize=15)\n",
    "\n",
    "plt.xticks([])\n",
    "plt.yticks([])\n",
    "plt.xlabel('Variable 1')\n",
    "plt.ylabel('Variable 2')\n",
    "plt.legend(['All data','Sample 1','Sample 2','Mean s1','Mean s2'],\n",
    "           bbox_to_anchor=[1,1.02])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_meanOfSamplesGeom.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "bJGq72rjez_s"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Csft8co1e0CR"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.6: Sample biases can be overcome by LLN (given some assumptions)"
   ],
   "metadata": {
    "id": "ImvujL6nDHLz"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "N = 512\n",
    "X = np.random.rand(N,2)\n",
    "\n",
    "# nonrandom sorting\n",
    "X = X[np.argsort(np.sum(X**2,axis=1)),:]\n",
    "\n",
    "\n",
    "# plot all data points\n",
    "plt.figure(figsize=(8,6))\n",
    "plt.plot(X[:,0],X[:,1],'s',color=(.7,.7,.7),markerfacecolor='w',markersize=10)\n",
    "\n",
    "\n",
    "# nonrandom sampling to simulate a bias\n",
    "sampmeans = np.zeros((6,2)) # hard-coded to 6 samples...\n",
    "sampbias = np.linspace(20,N-40,6,dtype=int)\n",
    "shapes = 'o^d*XP'\n",
    "colors = 'brmkgc'\n",
    "for si in range(6):\n",
    "\n",
    "  # biased sample and its mean\n",
    "  sample = X[sampbias[si]:sampbias[si]+40,:]\n",
    "  sampmeans[si,:] = np.mean(sample,axis=0)\n",
    "\n",
    "  # plot samples\n",
    "  plt.plot(sample[:,0],sample[:,1],shapes[si],color=colors[si],\n",
    "           markerfacecolor=colors[si],label=f'Sample {si+1}')\n",
    "\n",
    "  # plot sample mean\n",
    "  plt.plot(sampmeans[si,0],sampmeans[si,1],shapes[si],color='k',\n",
    "           markerfacecolor=colors[si],markersize=15)\n",
    "\n",
    "\n",
    "\n",
    "# plot the average of sample means\n",
    "plt.plot(np.mean(sampmeans[:,0]),np.mean(sampmeans[:,1]),'ko',\n",
    "         markerfacecolor='k',markersize=20,label='Average of means')\n",
    "\n",
    "plt.xticks([])\n",
    "plt.yticks([])\n",
    "plt.xlabel('Variable 1')\n",
    "plt.ylabel('Variable 2')\n",
    "plt.legend(bbox_to_anchor=[1,1.02])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_meanOfSamplesGeom_biased.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "PkBg52Kze0FF"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "aHT4UgUVxbhv"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.7: CLT, demo 1"
   ],
   "metadata": {
    "id": "V9ZYGy7Uxbko"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# generate \"population\" (simulating a weighted die)\n",
    "population = [ 1, 1, 2, 2, 3, 4, 5, 6 ]\n",
    "for i in range(20):\n",
    "    population = np.hstack((population,population))\n",
    "\n",
    "nPop = len(population)\n",
    "expval = np.mean(population)\n",
    "print(f'Expected value (population mean): {expval}')\n",
    "print(f'Population size: {nPop}')"
   ],
   "metadata": {
    "id": "NCG6oqzxDMRo"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters and initializations\n",
    "samplesize   = 30\n",
    "numberOfExps = 500\n",
    "samplemeans  = np.zeros(numberOfExps)\n",
    "\n",
    "# run the experiment!\n",
    "for expi in range(numberOfExps):\n",
    "  # compute and store its mean\n",
    "  samplemeans[expi] = np.mean( np.random.choice(population,size=samplesize) )\n",
    "\n",
    "\n",
    "# show the results\n",
    "fig,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "# histogram of the data\n",
    "axs[0].hist(population,bins=np.arange(.5,7.5,step=1),color=[.8,.8,.8],edgecolor='k')\n",
    "axs[0].set(xticks=range(1,7),xlabel='Die face',ylabel='Count')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Distribution of population data')\n",
    "axs[0].ticklabel_format(style='plain')\n",
    "\n",
    "# histogram of the sample means\n",
    "axs[1].hist(samplemeans,bins='fd',color=[.8,.8,.8],edgecolor='k')\n",
    "axs[1].axvline(expval,linewidth=3,color='k',linestyle='--')\n",
    "axs[1].set(xlabel='Sample mean',ylabel='Count')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Distribution of sample means')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_CLTdemo1.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "vpqmwv4Iq1HS"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "68sbxpKexbnV"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.8: CLT, demo 2"
   ],
   "metadata": {
    "id": "OkiIZBhOxbpw"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# new population!\n",
    "Npop = 1000000\n",
    "population = np.random.randn(Npop)**2\n",
    "\n",
    "\n",
    "# parameters and initializations\n",
    "samplesize   =  30\n",
    "numberOfExps = 500\n",
    "samplemeans  = np.zeros(numberOfExps)\n",
    "\n",
    "# run the experiment!\n",
    "for expi in range(numberOfExps):\n",
    "  # compute and store its mean\n",
    "  samplemeans[expi] = np.mean( np.random.choice(population,size=samplesize) )\n",
    "\n",
    "\n",
    "# show the results\n",
    "fig,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "# histogram of the data\n",
    "axs[0].hist(population,bins=50,color=[.8,.8,.8],edgecolor='k')\n",
    "axs[0].set(xlabel='Data value',ylabel='Count')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Distribution of population data')\n",
    "axs[0].ticklabel_format(style='plain')\n",
    "\n",
    "# histogram of the sample means\n",
    "axs[1].hist(samplemeans,bins='fd',color=[.8,.8,.8],edgecolor='k')\n",
    "axs[1].set(xlabel='Sample mean',ylabel='Count',xlim=[0,4])\n",
    "axs[1].axvline(np.mean(population),linewidth=3,color='k',linestyle='--')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Distribution of sample means')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_CLTdemo2.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "aLkgjAn_8rAn"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "iwIs3oby8rDN"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.9: CLT, demo 3"
   ],
   "metadata": {
    "id": "hT4KOPJ9K8Qh"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create two data variables\n",
    "x = np.linspace(0,6*np.pi,10000)\n",
    "s = np.sin(x)\n",
    "u = 2*np.random.rand(len(x))-1\n",
    "\n",
    "# combine them into a list for convenience\n",
    "datasets = [ s,u,s+u ]\n",
    "axislets = iter('ABCDEF') # axis labels\n",
    "\n",
    "\n",
    "# plot!\n",
    "_,axs = plt.subplots(3,2,figsize=(7,6))\n",
    "\n",
    "for i in range(3):\n",
    "\n",
    "  # axis variable label\n",
    "  dlab = str(i+1) if i<2 else '1+2'\n",
    "\n",
    "  # plot the data\n",
    "  axs[i,0].plot(x,datasets[i],'k.',markersize=1)\n",
    "  axs[i,0].set_title(r'$\\bf{%s}$)  Data %s' %(next(axislets),dlab))\n",
    "\n",
    "  # plot the histogram\n",
    "  axs[i,1].hist(datasets[i],bins=200,color='k',edgecolor=None)\n",
    "  axs[i,1].set_title(r'$\\bf{%s}$)  Histogram %s' %(next(axislets),dlab))\n",
    "\n",
    "\n",
    "# adjust the axis properties\n",
    "for a in axs.flatten():\n",
    "  a.xaxis.set_visible(False)\n",
    "  a.spines['bottom'].set_visible(False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_CLTdemo3a.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "E7P1CVMEAPqi"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "JilCbKOR7Opy"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 9.10: CLT requires comparable scaling"
   ],
   "metadata": {
    "id": "fyFB2W_V7O63"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# only difference from the previous figure is the amplitude-scaling!\n",
    "s = 10*np.sin(x)\n",
    "\n",
    "# combine them into a list for convenience\n",
    "datasets = [ s,u,s+u ]\n",
    "axislets = iter('ABCDEF') # axis labels\n",
    "\n",
    "\n",
    "# plot!\n",
    "_,axs = plt.subplots(3,2,figsize=(7,6))\n",
    "\n",
    "for i in range(3):\n",
    "\n",
    "  # axis variable label\n",
    "  dlab = str(i+1) if i<2 else '1+2'\n",
    "\n",
    "  # plot the data\n",
    "  axs[i,0].plot(x,datasets[i],'k.',markersize=1)\n",
    "  axs[i,0].set_title(r'$\\bf{%s}$)  Data %s' %(next(axislets),dlab))\n",
    "\n",
    "  # plot the histogram\n",
    "  axs[i,1].hist(datasets[i],bins=200,color='k',edgecolor=None)\n",
    "  axs[i,1].set_title(r'$\\bf{%s}$)  Histogram %s' %(next(axislets),dlab))\n",
    "\n",
    "\n",
    "# adjust the axis properties\n",
    "for a in axs.flatten():\n",
    "  a.xaxis.set_visible(False)\n",
    "  a.spines['bottom'].set_visible(False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_CLTdemo3b.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "kL9cTBLiAPoB"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "0pmbwKL5APlh"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 1"
   ],
   "metadata": {
    "id": "eraiT6xExAt0"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# variance levels (tau^2)\n",
    "tau2levels = np.linspace(.1,10,40)\n",
    "\n",
    "# simulation parameters\n",
    "samplesize = 200\n",
    "numsamples =  20\n",
    "\n",
    "# initialize results matrix\n",
    "results = np.zeros((numsamples,len(tau2levels),2))\n",
    "\n",
    "\n",
    "### run the experiment!\n",
    "# loop over tau levels\n",
    "for ni,tau2 in enumerate(tau2levels):\n",
    "\n",
    "  # repeat for multiple samples\n",
    "  for sampi in range(numsamples):\n",
    "\n",
    "    # generate sample data with tau modulation\n",
    "    data = np.random.normal(0,np.sqrt(tau2),size=samplesize)\n",
    "\n",
    "    # store sample mean and variance\n",
    "    results[sampi,ni,0] = np.mean(data)\n",
    "    results[sampi,ni,1] = np.var(data,ddof=1)\n",
    "\n",
    "\n",
    "### plotting\n",
    "_,axs = plt.subplots(2,1,figsize=(7,7))\n",
    "\n",
    "# plot the average of the sample means\n",
    "axs[0].plot(np.tile(tau2levels,(20,1)),results[:,:,0],'ks',\n",
    "            markerfacecolor=(.6,.6,.6),markersize=8)\n",
    "axs[0].plot(tau2levels,np.mean(results[:,:,0],axis=0),'kd',\n",
    "            markerfacecolor='w',markersize=12)\n",
    "axs[0].set_title(r'$\\bf{A}$)  Sample averages')\n",
    "\n",
    "# plot the average within-sample variances\n",
    "axs[1].plot(tau2levels,np.mean(results[:,:,1],axis=0),'k^',\n",
    "            markerfacecolor=(.7,.7,.7),markersize=8,label='Average variances')\n",
    "\n",
    "# plot the average across-sample variance of the sample means\n",
    "axs[1].plot(tau2levels,np.var(results[:,:,0],axis=0,ddof=1),'ko',\n",
    "            markerfacecolor=(.8,.8,.8),markersize=8,label='Variance of averages')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Sample variances')\n",
    "axs[1].legend()\n",
    "\n",
    "for a in axs:\n",
    "  a.set(xlabel=r'$\\tau^2$',ylabel='Value',\n",
    "        xlim=[tau2levels[0]-.2,tau2levels[-1]+.2])\n",
    "\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_ex1.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "WLdmvsKnxByE"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "TWCQKs6JeA4B"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 2"
   ],
   "metadata": {
    "id": "OPjG3nODeBZx"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "SEKXY62leBZx"
   },
   "outputs": [],
   "source": [
    "# the sample sizes\n",
    "samplesizes = np.arange(10,1001)\n",
    "\n",
    "# generate population data with known std\n",
    "pop_std     = 2.4\n",
    "populationN = 1000000\n",
    "population  = np.random.randn(populationN)\n",
    "population  = population / np.std(population,ddof=1) # force std=1\n",
    "population  = population * pop_std # force std\n",
    "\n",
    "\n",
    "# initialize results matrix\n",
    "samplestds = np.zeros(len(samplesizes))\n",
    "\n",
    "# run the experiment!\n",
    "for sampi in range(len(samplesizes)):\n",
    "\n",
    "  # pick a random sample\n",
    "  sample = np.random.choice(population,size=samplesizes[sampi])\n",
    "  samplestds[sampi] = np.std(sample,ddof=1)\n",
    "\n",
    "\n",
    "# show the results!\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(samplesizes,samplestds,'s',markerfacecolor=(.9,.9,.9),color=(.6,.6,.6))\n",
    "plt.axhline(pop_std,color='k',linewidth=3)\n",
    "plt.xlabel('Sample size')\n",
    "plt.ylabel('Standard deviation value')\n",
    "plt.xlim([samplesizes[0]-7,samplesizes[-1]+7])\n",
    "plt.legend(('Sample stds','Population std'))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_ex2.png')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Note about the data-generation method:\n",
    "# It is not sufficient to use np.random.normal(0,2.4,N), because that does\n",
    "# not guarantee a *population* standard deviation of 2.4. Instead, it is\n",
    "# necessary to force the std by first scaling to std=1 and then multiplying.\n",
    "\n",
    "# Here's a demonstration:\n",
    "print(np.std(np.random.normal(0,pop_std,populationN),ddof=1))\n",
    "print(np.std(population,ddof=1))"
   ],
   "metadata": {
    "id": "8tq74ahGSBFz"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "thqJYN7ReA7j"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 3"
   ],
   "metadata": {
    "id": "nSUtGj25eA-N"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "popMean1 = 3\n",
    "popMean2 = 3.2\n",
    "\n",
    "\n",
    "# generate the populations\n",
    "population1 = np.random.randn(populationN)\n",
    "population1 = population1 - np.mean(population1) + popMean1\n",
    "\n",
    "population2 = np.random.randn(populationN)\n",
    "population2 = population2 - np.mean(population2) + popMean2\n",
    "\n",
    "# one sample\n",
    "s1 = np.mean( np.random.choice(population1,size=30) )\n",
    "s2 = np.mean( np.random.choice(population2,size=30) )\n",
    "\n",
    "print(f'Population difference: {popMean1-popMean2:.3f}')\n",
    "print(f'Sample difference:     {s1-s2:.3f}')"
   ],
   "metadata": {
    "id": "1hChAfLAeBBA"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# initialize results matrix\n",
    "samplediffs = np.zeros(len(samplesizes))\n",
    "\n",
    "# run the experiment!\n",
    "for sampi in range(len(samplesizes)):\n",
    "\n",
    "  # pick a random sample\n",
    "  s1 = np.random.choice(population1,size=samplesizes[sampi])\n",
    "  s2 = np.random.choice(population2,size=samplesizes[sampi])\n",
    "  samplediffs[sampi] = np.mean(s1) - np.mean(s2)\n",
    "\n",
    "\n",
    "# show the results!\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(samplesizes,samplediffs,'s',markerfacecolor=(.9,.9,.9),color=(.6,.6,.6))\n",
    "plt.plot(samplesizes[[0,-1]],[popMean1-popMean2,popMean1-popMean2],'k',linewidth=3)\n",
    "plt.plot(samplesizes[[0,-1]],[0,0],color=(.7,.7,.7),linestyle=':')\n",
    "\n",
    "plt.xlabel('Sample size')\n",
    "plt.ylabel('Sample differences')\n",
    "plt.xlim([samplesizes[0]-7,samplesizes[-1]+7])\n",
    "plt.legend(('Sample diffs','Population diff'))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_ex3.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "3_kFIR12fUXT"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "yhAHlEe0eBDr"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 4"
   ],
   "metadata": {
    "id": "xljzr0cMetie"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "N = 1200\n",
    "numbers = np.zeros((N,3))\n",
    "\n",
    "for i in range(N):\n",
    "  nums = np.random.choice(range(100),2)\n",
    "  numbers[i,0] = nums[0]\n",
    "  numbers[i,1] = nums[1]\n",
    "  numbers[i,2] = np.mean(nums)\n",
    "\n",
    "\n",
    "# show the histograms\n",
    "_,axs = plt.subplots(1,3,figsize=(10,np.pi))\n",
    "for i in range(3):\n",
    "  axs[i].hist(numbers[:,i],bins='fd',color=(.3,.3,.3),density=True)\n",
    "  axs[i].set(xlabel='Number',ylabel='Proportion')\n",
    "\n",
    "\n",
    "axs[0].set_title(r'$\\bf{A}$)  First number',loc='left')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Second number',loc='left')\n",
    "axs[2].set_title(r'$\\bf{C}$)  Their average',loc='left')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_ex4.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "qZr1cFfsxXvL"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "ida8atKvxYC1"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 6"
   ],
   "metadata": {
    "id": "719T7ePsxYFm"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# a population of random numbers\n",
    "Npop = 1000000\n",
    "population = np.random.randn(Npop)**2\n",
    "\n",
    "\n",
    "# parameters and initializations\n",
    "samplesizes   = np.arange(5,500,8)\n",
    "numberOfsamps = 1000\n",
    "samplemeans   = np.zeros(numberOfsamps)\n",
    "fwhms         = np.zeros(len(samplesizes))\n",
    "peakvals      = np.zeros(len(samplesizes))\n",
    "\n",
    "# line colors\n",
    "c = np.linspace((.9,.9,.9),(0,0,0),len(samplesizes))\n",
    "\n",
    "\n",
    "\n",
    "## run the experiment!\n",
    "plt.figure(figsize=(7,4))\n",
    "for Ns in range(len(samplesizes)):\n",
    "\n",
    "  # compute the means of lots of samples\n",
    "  for expi in range(numberOfsamps):\n",
    "    samplemeans[expi] = np.mean( np.random.choice(population,size=samplesizes[Ns]) )\n",
    "\n",
    "\n",
    "  # make a histogram of those means\n",
    "  yy,xx = np.histogram(samplemeans,np.linspace(.4,1.6,41))\n",
    "  yy = yy/np.sum(yy)\n",
    "\n",
    "  ### compute FWHM\n",
    "  # step 1: normalize\n",
    "  yn = yy/np.max(yy)\n",
    "\n",
    "  # step 2: find peak index\n",
    "  idx = np.argmax(yn)\n",
    "\n",
    "  # step 3: compute FWHM\n",
    "  fwhms[Ns] = xx[idx-1+np.argmin(np.abs(yn[idx:]-.5))] - xx[np.argmin(np.abs(yn[:idx]-.5))]\n",
    "\n",
    "  # also store mean value\n",
    "  peakvals[Ns] = xx[idx]\n",
    "\n",
    "  # plot\n",
    "  plt.plot((xx[:-1]+xx[1:])/2,yy,color=c[Ns])\n",
    "\n",
    "\n",
    "plt.xlim([.4,1.6])\n",
    "plt.xlabel('Sample mean value')\n",
    "plt.ylabel('Proportion')\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_ex6a.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "J85IeKMZVW2U"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "axs[0].plot(samplesizes,fwhms,'ks',markerfacecolor='w',markersize=8)\n",
    "axs[0].set(xlabel='Sample sizes',ylabel='FWHM',xlim=[-20,samplesizes[-1]+20])\n",
    "axs[0].set_title(r'$\\bf{A}$)  FWHM by sample size')\n",
    "\n",
    "axs[1].plot(samplesizes,peakvals,'ks',markerfacecolor='w',markersize=8)\n",
    "axs[1].set(xlabel='Sample sizes',ylabel='Peak values',ylim=[.5,1.5],xlim=[-20,samplesizes[-1]+20])\n",
    "axs[1].set_title(r'$\\bf{B}$)  Peaks by sample size')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_ex6b.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "TL5EP0kSxYIm"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "yxUclRROxYLm"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 7"
   ],
   "metadata": {
    "id": "ULMxg0GsURsz"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# a population of random numbers\n",
    "Npop = 1000000\n",
    "population = np.random.randn(Npop)**2\n",
    "\n",
    "\n",
    "# experiment parameters\n",
    "samplesizes = np.logspace(np.log10(10),np.log10(Npop/10),25,dtype=int)\n",
    "numExps = 50\n",
    "\n",
    "\n",
    "# theoretical standard error based on population standard deviation\n",
    "theory = np.std(population) / np.sqrt(samplesizes)\n",
    "\n",
    "# initialize the empirical estimates\n",
    "standerror = np.zeros((numExps,len(samplesizes)))\n",
    "samplemeans = np.zeros((numExps,len(samplesizes)))\n",
    "\n",
    "\n",
    "# Run the experiment!\n",
    "for expi in range(numExps):\n",
    "  for idx,ssize in enumerate(samplesizes):\n",
    "\n",
    "    # generate a random sample\n",
    "    rsample = np.random.choice(population,size=ssize)\n",
    "\n",
    "    # compute its standard error (estimate) and the sample mean\n",
    "    standerror[expi,idx] = np.std(rsample,ddof=1) / np.sqrt(ssize)\n",
    "    samplemeans[expi,idx] = np.mean(rsample)\n",
    "\n",
    "\n",
    "\n",
    "## plotting\n",
    "_,axs = plt.subplots(1,2,figsize=(9,4))\n",
    "axs[0].plot(samplesizes,theory,'ks-',markersize=10,markerfacecolor=(.7,.7,.7),label='Analytical SEM')\n",
    "axs[0].plot(samplesizes,np.mean(standerror,axis=0),'p-',color=(.3,.3,.3),\n",
    "            markersize=10,markerfacecolor=(0,0,0),label='Empirical SEM')\n",
    "axs[0].plot(samplesizes,np.std(samplemeans,axis=0,ddof=1),'o-',color=(.7,.7,.7),\n",
    "            markersize=10,markerfacecolor=(.9,.9,.9),label='Sample means STD')\n",
    "axs[0].set(xlabel='Sample size',ylabel='Sample means variabilities')\n",
    "axs[0].legend()\n",
    "axs[0].set_title(r'$\\bf{A}$)  Estimates by sample size')\n",
    "\n",
    "axs[1].plot(theory,np.mean(standerror,axis=0),'ko',markerfacecolor=(.8,.8,.8),markersize=12,alpha=.5,label='Empirical SEM')\n",
    "axs[1].plot(np.std(samplemeans,axis=0,ddof=1),np.mean(standerror,axis=0),'ks',markerfacecolor=(.3,.3,.3),markersize=12,alpha=.5,label='Sample mean std')\n",
    "axs[1].set(xlabel='Analytical SEM',ylabel=r'Emp. SEM $or$ means STD')\n",
    "axs[1].legend()\n",
    "axs[1].set_title(r'$\\bf{B}$)  Consistency of estimates')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('sample_ex7.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "hZJc_LgZURpl"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "PIAjlmyLiGq8"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}